Use this to keep track of useful code bits as I learn Python. (Krista, August 19, 2015)

Shortcut -> Action: Shift-Enter -> run cell; Ctrl-Enter -> run cell in-place; Alt-Enter -> run cell, insert below


Ctrl / (Ctrl and then the slash)...will comment out any selected text within a block of code


In [ ]:
# First up: show what is in the current working directory.
import os
import sys
os.listdir(os.getcwd())

In [ ]:
# Load the CSV file into a data frame, using its 'RInumber' column as the row index,
# then show the first five rows.
# head() is called on the data frame itself; the module-level spelling
# pd.head(CO_RawData) does not work.
CO_RawData = pd.read_csv(mtabFile, index_col='RInumber')
CO_RawData.head(5)

In [ ]:
#insert an image...the gif file here would be in the folder
# Image is given the file name as a (relative) URL; because the Image(...) call is the
# last expression in the cell, its value is what the notebook shows as the cell result.
from IPython.display import Image
Image(url="R02485.gif")

In [ ]:
# Minimal for-loop example: the body runs three times (x = 0, 1, 2).
for x in range(3):
    print("hello")

In [ ]:
# Set the figure-level title to the value of CO followed by ' working'.
# NOTE(review): neither `fig` nor `CO` is created in this cell; both must already
# exist from an earlier cell.
fig.suptitle(CO + ' working') #use the plus sign to concatenate strings for the title

In [ ]:
# For every key CO in CO_withKO, draw one figure with the related KO rows (red) and the
# related CO rows (black), each row scaled by its own row sum, and title the figure with
# the CO name plus a kmeans cluster value taken from CcoClust.
# Tracer()() drops into the interactive debugger at that point in the loop.
from IPython.core.debugger import Tracer #used this to step into the function and debug it, also need line with Tracer()() 
for i, CO in enumerate(CO_withKO):
    #if i==2:
         #break
    kos=CO_withKO[CO]['Related KO']
    cos=CO_withKO[CO]['Related CO']
    for k in kos: 
        # the inner loop only acts on the first k found in KO_RawData's index
        # (see the break at the bottom), so there is at most one figure per CO
        if k in KO_RawData.index: 
            # NOTE(review): the whole kos list is looked up here, not just k -- confirm
            # that every entry of kos is expected to be present in KO_RawData
            kData=KO_RawData.loc[kos].dropna()
            # divide each row by its row sum
            kData=(kData.T/kData.sum(axis=1)).T
            cData=CO_RawData.loc[cos].dropna()
            cData=(cData.T/cData.sum(axis=1)).T

            fig, ax=plt.subplots(1)
            # transpose so that each original row is drawn as one line
            kData.T.plot(color='r', ax=ax)
            cData.T.plot(color='k', ax=ax)

            # debugger breakpoint: execution pauses here on every plotted CO
            Tracer()()

            # NOTE(review): the cluster is always read for the fixed label 'C01909', while
            # the title below is built from the current CO -- confirm whether CO was intended
            getKmeans = CcoClust.loc['C01909']['kmeans']
            makeStringLabel = CO + '_kmeansCluster_' + str(getKmeans)
            #fig.suptitle(CO)
            fig.suptitle(makeStringLabel)

            #fig.savefig(CO+'.png') #stop saving all the images for now...
            break

In [7]:
#here, tData is a pandas data frame that I want to plot into a bar graph
#tData.plot(kind = "bar") ##this would be the code to run if tData existed...
#instead I am reading in the file saved and present in my working directory using this:
# Image(filename=...) loads the picture from a local file path (the earlier image cell
# used url= instead); as the last expression of the cell it becomes the cell's output.
from IPython.display import Image
Image(filename="SampleBarGraph.png")


Out[7]:

In [ ]:
#indexing in Python is a bit bizarre, or at least takes some getting used to.
# df.ix[0,'cNumber'] #this will allow me to mix index from integers with index by label
# (note: .ix was later deprecated and then removed from pandas, so prefer the loc/iloc forms below)
#the other way apparently uses iloc and loc, to index by integer position and by label respectively
# this would be df.iloc[0].loc['cNumber'] {can't get that to work in the if statement}

In [8]:
#ways to subset data...
# NOTE: only the last expression of a cell is displayed, so the values of the first and
# third statements below are computed and then discarded.
# look up index label 'C05356', then pull the 'kmeans' entry out of the result
CcoClust.loc['C05356']['kmeans']
# keep everything stored under label 'C05356' for inspection
tData = CcoClust.loc['C05356']
type(tData)

#want to select only the first group in the kmeans clusters 
#(baby steps, eventually do this for each cluster)
# boolean mask: keeps the rows whose kmeans value equals 1
CcoClust[CcoClust.kmeans==1]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-73c35b442f8f> in <module>()
----> 1 CcoClust.loc['C05356']['kmeans']

NameError: name 'CcoClust' is not defined

...this is where I learned not to use pip install with scikit-learn. To upgrade scikit-learn, use: conda update scikit-learn


In [1]:
import sklearn.cluster
#from sklearn.cluster import KMeans

In [1]:
# Hand-entered values that get scatter-plotted against their position in the list
# (presumably average silhouette scores, going by the name -- confirm).
silAverage = [
    0.4227, 0.33299, 0.354, 0.3768, 0.3362,
    0.3014, 0.3041, 0.307, 0.313, 0.325,
    0.3109, 0.2999, 0.293, 0.289, 0.2938,
    0.29, 0.288, 0.3, 0.287,
]

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

OK...can I get a simple scatter plot?


In [15]:
# Simple scatter plot: position in the list on the x axis, the value on the y axis.
plt.scatter(range(len(silAverage)), silAverage)
plt.grid()  # put on a grid
plt.xlim(-1, 20)


Out[15]:
(-1, 20)

In [ ]:
#get list of column names in pandas data frame
# .columns.values holds the column labels; list() turns them into a plain Python list
list(my_dataframe.columns.values)

In [ ]:
# For the first ten rows of `ut`, overlay the related KO profiles (red) and the row's
# own values over dayList (black) -- but only for rows whose index label starts with 'R'.
# Rows with any other label are skipped. (The original spelled the skip as an `else:`
# branch containing only comments, which Python rejects; a guard clause replaces it.)
for i in range(0,len(ut)):
    if i == 10:
        break  # only look at the first ten rows
    p = ut.iloc[i,:]
    n = p.name  # index label of this row
    if n[0] != 'R':
        # skip over the KO plotting, so effectively doing nothing
        continue

    CO = p.KEGG
    kos = CO_withKO[CO]['Related KO']
    cos = CO_withKO[CO]['Related CO']  # not plotted in this cell (see note below); kept defined
    #Tracer()()
    for k in kos:
        if k not in KO_RawData.index:
            continue
        kData=KO_RawData.loc[kos].dropna()
        kData=(kData.T/kData.sum(axis=1)).T  # divide each row by its row sum

        # Author's note: the CO values are taken from the current row of `ut` (the
        # K-means output, which should already hold the normalized data) instead of
        # from CO_RawData, because that is the file being worked with right now.
        #cData=CO_RawData.loc[cos].dropna()
        #cData=(cData.T/cData.sum(axis=1)).T
        cData = pd.DataFrame(p[dayList]).T

        #go back and check, but I think this next step is already done
        #cData=(cData.T/cData.sum(axis=1)).T

        fig, ax=plt.subplots(1)
        kData.T.plot(color='r', ax=ax)
        cData.T.plot(color='k', ax=ax)

Write a function to match RI number and cNumbers


In [ ]:
def findRInumber(dataIn,KEGGin):
    """Find the possible RI numbers (index labels) for a given KEGG number.

    Parameters
    ----------
    dataIn : table-like
        Must support ``dataIn['KEGG']`` (iterable of KEGG identifiers) and
        ``dataIn.index`` (positionally indexable row labels), e.g. a pandas
        DataFrame.
    KEGGin : str
        The KEGG number to look for.

    Returns
    -------
    list
        The index labels of every row whose 'KEGG' entry equals ``KEGGin``,
        in row order; empty when nothing matches.

    Each match is also printed, one per line, so calling the function on its
    own in a notebook cell still shows the results.
    """
    matches = [dataIn.index[i]
               for i, KEGG in enumerate(dataIn['KEGG'])
               if KEGG == KEGGin]
    for t in matches:
        # print(...) with a single argument behaves the same on Python 2 and 3
        print(t)
    return matches

#For example: this will give back one row, C18028 will be multiple
# The call's result is bound to m; the bare `m` on the last line makes the notebook
# display whatever value the function returned.
m = findRInumber(forRelatedness,'C00031') 
m

In [ ]:
#to copy a matrix I would think this works: NOPE
#forRelatedness = CcoClust# this is NOT making a new copy...
#instead it makes a new name pointing to the existing data frame. So you now have two ways to 
#reference the same data frame. Make a change with one name and you can see the same change
#using the other name. Odd. No idea why you would want that.
#To get an independent copy, use: forRelatedness = CcoClust.copy()

In [ ]:
##this is the test that finally let me understand enumerate

# for index, KEGG in enumerate(useSmall['KEGG']):
#     print index,KEGG

In [ ]:
# Windows
# Open a KEGG compound page in Chrome via the standard-library webbrowser module.
import webbrowser  # was missing: the cell uses webbrowser.get() below

# command-line template handed to webbrowser.get(); the %s is where the URL goes
chrome_path = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s'

url = "http://www.genome.jp/dbget-bin/www_bget?cpd:C00019"
webbrowser.get(chrome_path).open_new(url)
#while a nice idea, this stays open until you close the web browser window.

In [1]:
from IPython.display import HTML
# Build (and print) one <iframe> tag per compound ID, each pointing at that
# compound's genome.jp page.
tList = ['C02265','C00001']
for i in tList:
    ml = '<iframe src = http://www.genome.jp/dbget-bin/www_bget?cpd:' + i + ' width=700 height=350></iframe>'
    # print(...) call form: same output on Python 2, and also valid on Python 3
    print(ml)


<iframe src = http://www.genome.jp/dbget-bin/www_bget?cpd:C02265 width=700 height=350></iframe>
<iframe src = http://www.genome.jp/dbget-bin/www_bget?cpd:C00001 width=700 height=350></iframe>

In [3]:
# Build an <iframe> tag pointing at the genome.jp page for the compound ID in CO and
# wrap it in HTML(...); as the last expression in the cell, that object is the cell's output.
from IPython.display import HTML
CO='C02265'
HTML('<iframe src = http://www.genome.jp/dbget-bin/www_bget?cpd:' + CO + ' width=700 height=350></iframe>')


Out[3]:

In [ ]: